#====# Prepares datasets for estimation window phase #====#

# Setup and import data ----
library(tidyverse)
library(tidylog, warn.conflicts = FALSE)
library(data.table)
library(countrycode)
library(bizdays)
library(MatchIt)
library(WeightIt)
library(cobalt)

# Business calendar in which Saturdays and Sundays are non-working days.
# It is created under the name "biz_calendar" and also kept as an R object.
business_calendar <- create.calendar(
  name = "biz_calendar",
  weekdays = c("saturday", "sunday")
)

# Input data sets (all read from .rds files under data/):
# daily Compustat stocks, downloaded from WRDS (Compustat)
stocks <- read_rds(file = "data/stocks_compustat.rds")
# quarterly Compustat covariates, downloaded from WRDS (Compustat)
covars <- read_rds(file = "data/covars_compustat.rds")
# S&P 500 index constituents, downloaded from WRDS (Compustat)
indexes <- read_rds(file = "data/SPindex_const.rds")
# subsidiary information, downloaded from WRDS subsidiary
subsid <- read_rds(file = "data/WRDS_subsidiary.rds")
# aggregate S&P 500 index, downloaded from the Yahoo! Finance R API
sp500_index <- read_rds(file = "data/SP500_index.rds")
# FCPA data, retrieved from the FCPA Clearinghouse
fcpa <- read_rds(file = "data/FCPA.rds")
n_distinct(fcpa$ticker_symbol) # 286 unique firms

# Clean stocks data ----
## Drop pennystocks ----
# Penny stocks: tickers whose mean closing price (prccd) over December 2024
# is at most $1. Their tickers are collected in `drop` and removed from `stocks`.
drop <- stocks %>%
  filter(date >= as.Date("2024-12-01"),
         date <= as.Date("2024-12-31")) %>%
  group_by(ticker_symbol) %>%
  summarise(avg_close = mean(prccd, na.rm = TRUE)) %>%
  filter(avg_close <= 1) %>%
  pull(ticker_symbol)

stocks <- filter(stocks, !(ticker_symbol %in% drop))

# Are any past FCPA targets among the dropped penny stocks?
fcpa_is_penny <- fcpa$ticker_symbol %in% drop
sum(fcpa_is_penny) # 1 of the past FCPA targets trades as a penny stock; which one?
fcpa$conm[fcpa_is_penny] # Corsa Coal Corporation

## Compute returns ----
# Daily return per ticker: percentage change of the closing price (prccd)
# relative to the previous row of the same ticker, after sorting by date.
# The first row of every ticker has no previous price, so its `chg` is NA.
stocks <- stocks %>%
  arrange(ticker_symbol, date) %>%
  group_by(ticker_symbol) %>%
  mutate(prccd_lag1 = lag(prccd, n = 1L),
         chg = 100 * (prccd - prccd_lag1) / prccd_lag1) %>%
  ungroup() %>%
  # the lagged price was only a helper for the computation above
  select(-prccd_lag1) %>%
  relocate(chg, .after = prccd)
# note: `chg` is already expressed in *percentage* points (multiplied by 100)

# Sanity check: `stocks` should contain business days only.
# The calendar must be passed explicitly: without it, is.bizday() falls back to
# bizdays' default calendar, under which weekends are not excluded, so the
# check could never flag a row. `business_calendar` (defined in the setup
# section) treats Saturdays and Sundays as non-working days.
stocks %>%
  filter(!is.bizday(date, business_calendar)) # expected: zero rows

## Separate samples ----
### S&P 500 constituents ----
# Price series of every ticker listed in the S&P 500 constituents table.
sp500 <- filter(stocks, ticker_symbol %in% indexes$ticker_symbol)

n_distinct(sp500$ticker_symbol) # 503 
n_distinct(sp500$conm) # 500 
n_distinct(sp500$gvkey) # 500 

#### Fix multiple listing ----
# More tickers than firms: list every gvkey that maps to more than one
# (company name, ticker) pair.
sp500 %>%
  distinct(gvkey, conm, ticker_symbol) %>%
  group_by(gvkey) %>%
  filter(n() > 1)
# Fox, News Corp and Google each appear with two tickers.
# Drop one ticker per firm so that each firm is represented once:
duplicate_listings <- c("NWSA", "FOXA", "GOOGL")
sp500 <- filter(sp500, !(ticker_symbol %in% duplicate_listings))

n_distinct(sp500$ticker_symbol) # 500
n_distinct(sp500$conm) # 500
n_distinct(sp500$gvkey) # 500

### Firms subject to FCPA in the past ----
# Price series of every ticker that appears in the FCPA data.
sample <- filter(stocks, ticker_symbol %in% fcpa$ticker_symbol)

n_distinct(sample$ticker_symbol) # 262
n_distinct(sample$conm) # 262
n_distinct(sample$gvkey) # 262

# Which FCPA firms have no price series in `stocks`?
fcpa %>%
  filter(!(ticker_symbol %in% sample$ticker_symbol)) %>%
  distinct(conm, ticker_symbol, company_market) %>%
  print(n = Inf)
# 24 firms

# Clean covariates ----
## Calculate average trading price per firm in December 2024 ----
# Attach `avg_dec`, the mean closing price of each ticker over December 2024,
# to the covariates. Tickers whose mean cannot be computed (NA/NaN) are left
# out of the lookup, so they end up with avg_dec = NA after the left join.
covars <- left_join(
  x = covars,
  y = stocks %>%
    arrange(ticker_symbol, date) %>%
    filter(date >= as.Date("2024-12-01"),
           date <= as.Date("2024-12-31")) %>%
    group_by(ticker_symbol) %>%
    summarise(avg_dec = mean(prccd, na.rm = TRUE)) %>%
    filter(!is.na(avg_dec)),
  by = "ticker_symbol"
)

## Clean subsidiary data ----
# Derive the calendar year from `fdate` and make it the first column.
subsid <- subsid %>%
  mutate(year = year(fdate)) %>%
  select(year, everything())

# Check for subsidiaries without a country code:
filter(subsid, is.na(country_code)) # none missing, alright

# Coverage of the subsidiary data over time:
summary(subsid$year) # we have info only up to 2022, they're not updating this anymore 

### Join with covariates ----
# Per-firm subsidiary counts, taken from each firm's latest available year,
# joined onto the covariates by gvkey. Firms without subsidiary records get 0
# in every column whose name starts with "n_".
covars <- covars %>%
  left_join(
    subsid %>%
      group_by(gvkey) %>%
      # restrict each firm to its most recent year in the subsidiary data
      filter(year == max(year, na.rm = TRUE)) %>%
      summarise(
        n_subsid_nonUS = sum(country_code != "US"),
        n_subsid_US = sum(country_code == "US"),
        n_subsid = n(),
        # number of distinct non-US country codes
        n_unique_iso2 = n_distinct(country_code[country_code != "US"])
      ),
    by = "gvkey"
  ) %>%
  mutate(across(.cols = matches("^n\\_"),
                .fns = function(count) ifelse(is.na(count), 0, count)))

## Add binary for whether the firm is S&P 500 ----
# 1 when the ticker is part of the cleaned S&P 500 sample, 0 otherwise.
covars <- mutate(covars,
                 is_SP500 = as.numeric(ticker_symbol %in% sp500$ticker_symbol))

# Number of distinct tickers present in both the FCPA sample and the S&P 500:
length(intersect(sample$ticker_symbol, sp500$ticker_symbol)) # 89 treated firms in S&P 500

## Clean outstanding shares info ----
# ALLIANZ SE is reported with the wrong number of shares outstanding:
select(filter(covars, conm == "ALLIANZ SE"), conm, date, cshoq)
# The correct figure is 385,919,400; the reported one is 10x larger.

# Divide cshoq by 10 for ALLIANZ SE only; every other row keeps its value.
covars <- covars %>%
  mutate(cshoq = case_when(conm == "ALLIANZ SE" ~ cshoq/10,
                           .default = cshoq))

# Create a matched group of placebo firms (non-past FCPA defendants) ----
# Candidate pool for the placebo group: every stock that shares neither its
# ticker nor its gvkey with a past FCPA target.
match_candidates <- stocks %>%
  filter(!(ticker_symbol %in% sample$ticker_symbol | gvkey %in% sample$gvkey))

# Overlap checks on tickers:
with(match_candidates, unique(ticker_symbol %in% sample$ticker_symbol)) # correct
with(match_candidates, unique(ticker_symbol %in% sp500$ticker_symbol)) # there are S&P 500 members!

# Overlap checks on gvkeys:
with(match_candidates, unique(gvkey %in% sample$gvkey)) # correct
with(match_candidates, unique(gvkey %in% sp500$gvkey))  # there are S&P 500 members!

# How many firms of the FCPA sample have covariates at all?
table(unique(sample$gvkey) %in% covars$gvkey) # 245 do, 17 don't

# Split the covariates into the two groups, keeping for every ticker only its
# most recent record (largest fiscal year, then largest fiscal quarter).

# past FCPA targets
sample_covars <- covars %>%
  filter(gvkey %in% sample$gvkey) %>%
  arrange(ticker_symbol, desc(fyearq), desc(fqtr)) %>%
  # after sorting, the first row of each ticker is its latest record
  distinct(ticker_symbol, .keep_all = TRUE)
n_distinct(sample_covars$gvkey) # 245 FCPA targets

# placebo candidates
match_candidates_covars <- covars %>%
  filter(gvkey %in% match_candidates$gvkey) %>%
  arrange(ticker_symbol, desc(fyearq), desc(fqtr)) %>%
  distinct(ticker_symbol, .keep_all = TRUE) %>%
  # keep only firms trading on an exchange that also hosts an FCPA target
  filter(exchg %in% unique(sample_covars$exchg))
n_distinct(match_candidates_covars$gvkey) # 6,796 possible non-FCPA targets 

# Stack treated firms (FCPA_sample = 1) on top of the candidates (FCPA_sample = 0).
matching <- rbind(
  mutate(sample_covars, FCPA_sample = 1),
  mutate(match_candidates_covars, FCPA_sample = 0)
)

## Matching ----
# Covariates used for matching / balance assessment:
vars <- c(
  "exchg",          # exchange
  "atq",            # assets - total
  "avg_dec",        # average december 2024 close price
  "naics2",         # NAICS-2 code
  "cshoq",          # common shares outstanding
  "n_unique_iso2",  # number of foreign countries of activity
  "n_subsid_nonUS", # number of non-US subsidiaries
  "n_subsid",       # tot number of subsidiaries
  "n_subsid_US",    # number of subsidiaries in the US
  "is_SP500"        # binary for S&P 500 membership
)

# Keep identifiers, the treatment flag and the covariates, then drop every
# firm with at least one missing value among them.
matching <- na.omit(
  select(matching, ticker_symbol, gvkey, conm, FCPA_sample, all_of(vars))
)
n_distinct(matching$ticker_symbol) # 3,648 candidates, made of:
n_distinct(matching$ticker_symbol[matching$FCPA_sample == 1]) # 240 treated firms
n_distinct(matching$ticker_symbol[matching$FCPA_sample == 0]) # 3,408 potential placebo firms

# Baseline covariate balance between FCPA targets and all candidates, before any
# matching or weighting. It is stored in `original` and re-printed after each
# method below for comparison.
# exchg and naics2 enter as factors; all remaining covariates enter as they are.
# NOTE(review): per the cobalt documentation, `binary = "std"` standardizes
# mean differences of binary covariates, `s.d.denom = "pooled"` uses the pooled
# standard deviation, and `m.threshold = 0.25` marks the balance threshold --
# confirm against the installed cobalt version.
original <- bal.tab(FCPA_sample ~ as.factor(exchg) + as.factor(naics2) + atq + avg_dec + cshoq + n_unique_iso2 + n_subsid_nonUS + n_subsid_US + n_subsid + is_SP500,
                    data = matching,
                    s.d.denom = "pooled", binary = 'std', un = TRUE, m.threshold = 0.25)
original

### Propensity score matching ----
# Nearest-neighbour matching, one control per treated firm (ratio = 1),
# without replacement.
# Note that the matching formula leaves out n_subsid_US and n_subsid, while the
# balance tables below still report them.
# NOTE(review): `discard = "control"` presumably drops control units outside
# the treated group's range of the distance measure -- confirm in the MatchIt docs.
prop.m <- matchit(FCPA_sample ~ as.factor(exchg) + as.factor(naics2) + atq + avg_dec + cshoq + n_unique_iso2 + n_subsid_nonUS + is_SP500,
                  data = matching,
                  method = "nearest",
                  ratio = 1,
                  discard = "control",
                  replace = FALSE) 
# one row per matched firm (treated and control), extracted from the matchit object
matched.data.prop <- prop.m %>%
  match.data()
table(matched.data.prop$FCPA_sample) # 1:1 match (240 firms)

# Balance on the matched data, on the full covariate list; compare with `original`.
balance <- bal.tab(FCPA_sample ~ as.factor(exchg) + as.factor(naics2) + atq + avg_dec + cshoq + n_unique_iso2 + n_subsid_nonUS + n_subsid_US + n_subsid + is_SP500,
                   data = matched.data.prop, binary = "std",
                   s.d.denom = "pooled", un = TRUE, m.threshold = 0.25)
balance
original

# Price series of the matched control (placebo) firms only:
matched.prop <- stocks %>%
  filter(ticker_symbol %in% matched.data.prop$ticker_symbol[matched.data.prop$FCPA_sample == 0])

# The placebo group should map one-to-one across gvkey, company name and ticker:
length(unique(matched.prop$gvkey)) # 240 firms
length(unique(matched.prop$conm)) # 240 company names
length(unique(matched.prop$ticker_symbol)) # associated with 240 securities

### Coarsened exact matching ----
# Coarsened exact matching on the same formula used for the nearest-neighbour match.
# NOTE(review): `k2k = TRUE` presumably requests one-to-one matching within
# strata; `ratio` and `replace` may be ignored (with a warning) when
# method = "cem" -- confirm against the MatchIt method_cem documentation.
cem.m <- matchit(FCPA_sample ~ as.factor(exchg) + as.factor(naics2) + atq + avg_dec + cshoq + n_unique_iso2 + n_subsid_nonUS + is_SP500,
                 data = matching,
                 method = "cem",
                 ratio = 1,
                 k2k = TRUE,
                 replace = FALSE) 
# one row per matched firm (treated and control), extracted from the matchit object
matched.data.cem <- cem.m %>%
  match.data()

# Balance on the CEM-matched data, on the full covariate list
# (this overwrites the `balance` object of the previous method).
balance <- bal.tab(FCPA_sample ~ as.factor(exchg) + as.factor(naics2) + atq + avg_dec + cshoq + n_unique_iso2 + n_subsid_nonUS + n_subsid_US + n_subsid + is_SP500,
                   data = matched.data.cem, binary = "std",
                   s.d.denom = "pooled", un = TRUE, m.threshold = 0.25)
balance # only one slightly imbalanced (atq) but it's quite close to the threshold anyway
original

# Price series of the CEM-matched control (placebo) firms only:
matched.cem <- stocks %>%
  filter(ticker_symbol %in% matched.data.cem$ticker_symbol[matched.data.cem$FCPA_sample == 0])

### Entropy balancing matching ----
# Entropy balancing weights targeting the ATT, estimated on the full covariate
# list (including n_subsid_US and n_subsid, unlike the two matching formulas).
# The weights in `ebal.w$weights` follow the row order of `matching`.
ebal.w <- weightit(FCPA_sample ~ as.factor(exchg) + as.factor(naics2) + atq + avg_dec + cshoq + n_unique_iso2 + n_subsid_nonUS + n_subsid_US + n_subsid + is_SP500,
                   data = matching,
                   method = "ebal", estimand = "ATT") 

# Balance after weighting. The covariates are passed explicitly, with exchg and
# naics2 converted to factors to mirror the formula above
# (this overwrites the `balance` object of the previous method).
balance <- bal.tab(ebal.w, 
                   covs = matching %>%
                     mutate(exchg = as.factor(exchg),
                            naics2 = as.factor(naics2)) %>%
                     select(exchg, naics2, atq, avg_dec, cshoq, n_unique_iso2, n_subsid_US, n_subsid_nonUS, n_subsid, is_SP500), 
                   binary = "std",
                   s.d.denom = "pooled", un = TRUE, m.threshold = 0.25)

balance # extremely good balance
original

# Price series of the control firms that received a positive entropy-balancing
# weight, with that weight attached as `entropy_weight`.
# The lookup holds only the join key and the weight, and the key is stated
# explicitly, so the join cannot silently change if `stocks` ever gains a
# column that shares a name with the lookup.
matched.entropy <- stocks %>%
  inner_join(data.frame(ticker_symbol = matching$ticker_symbol,
                        entropy_weight = ebal.w$weights,
                        FCPA_sample = matching$FCPA_sample) %>%
               # control firms only; the flag is not needed after this step
               filter(FCPA_sample == 0) %>%
               select(ticker_symbol, entropy_weight),
             by = "ticker_symbol") %>%
  filter(entropy_weight > 0)

### Export matching results to report ----
# Save the matching inputs and outputs in a single named list for the report:
# the full pool, the two matched data sets and the entropy-balancing object.
write_rds(
  x = list(
    whole_pool = matching,
    prop_score = matched.data.prop,
    coar_ex_ma = matched.data.cem,
    entropy_ba = ebal.w
  ),
  file = "data_out/matching.rds"
)

# Merge data ----
## Add covariates ----
# Covariate lookup shared by all four joins below: the latest covariate record
# of every FCPA target and every placebo candidate, reduced to gvkey plus the
# matching covariates. `exchg` is left out of the lookup (presumably because
# the stock data already carries it -- TODO confirm).
# It is built once instead of being rebuilt for every join.
# NOTE(review): the lookup holds one row per ticker but is joined on gvkey; a
# gvkey with several tickers in the covariates would duplicate price rows.
covars_by_gvkey <- rbind(sample_covars, match_candidates_covars) %>%
  select(gvkey, all_of(vars), -exchg)

# FCPA targets:
sample <- left_join(sample, covars_by_gvkey, by = "gvkey")

# add covariates to the matched samples, too
# our default is propensity score. We use CEM and entropy balancing as robustness checks
matched.prop <- left_join(matched.prop, covars_by_gvkey, by = "gvkey")

matched.cem <- left_join(matched.cem, covars_by_gvkey, by = "gvkey")

matched.entropy <- left_join(matched.entropy, covars_by_gvkey, by = "gvkey")

# Consistency check on the subsidiary counts: US plus non-US subsidiaries must
# add up to the total. Each call lists the offending rows (none expected).
sample %>%
  filter(n_subsid_nonUS + n_subsid_US != n_subsid) # none, correct

matched.prop %>%
  filter(n_subsid_nonUS + n_subsid_US != n_subsid) # none, correct

matched.cem %>%
  filter(n_subsid_nonUS + n_subsid_US != n_subsid) # none, correct

matched.entropy %>%
  filter(n_subsid_nonUS + n_subsid_US != n_subsid) # none, correct

# Prepare datasets for export ----
## Predictive covariates (individual S&P 500 constituents and composite index) ----
# Long table of daily returns: S&P 500 constituents that never appear in the
# FCPA data, sorted by ticker and date, followed by the S&P 500 composite index
# (appended at the bottom so that it becomes the last column after pivoting).
cov <- sp500 %>%
  filter(! ticker_symbol %in% fcpa$ticker_symbol) %>%
  select(date, ticker_symbol, chg) %>%
  arrange(ticker_symbol, date) %>%
  bind_rows(rename(sp500_index, ticker_symbol = index))

# Wide format, one column of returns per ticker (they'll serve as predictive
# covariates):
covars_w <- pivot_wider(cov,
                        names_from = ticker_symbol,
                        values_from = chg)

colnames(covars_w) # alright

### Export data ----
write_rds(x = covars_w, file = "data_out/predictive_SP500.rds")

## FCPA sample ----
# Subsidiary counts in the FCPA sample:
with(sample, summary(n_subsid_nonUS)) # non-negative, ok
with(sample, summary(n_subsid_US)) # non-negative, ok
with(sample, summary(n_subsid)) # non-negative, ok

n_distinct(sample$ticker_symbol) # 262 firms

# Overlap checks between the FCPA sample and each placebo group, in both
# directions and on both identifiers. Every call should print a single FALSE.

# propensity-score placebo group
with(sample, unique(gvkey %in% matched.prop$gvkey)) # FALSE, correct
with(sample, unique(ticker_symbol %in% matched.prop$ticker_symbol)) # FALSE, correct

with(matched.prop, unique(gvkey %in% sample$gvkey)) # FALSE, correct
with(matched.prop, unique(ticker_symbol %in% sample$ticker_symbol)) # FALSE, correct


# coarsened-exact-matching placebo group
with(sample, unique(gvkey %in% matched.cem$gvkey)) # FALSE, correct
with(sample, unique(ticker_symbol %in% matched.cem$ticker_symbol)) # FALSE, correct

with(matched.cem, unique(gvkey %in% sample$gvkey)) # FALSE, correct
with(matched.cem, unique(ticker_symbol %in% sample$ticker_symbol)) # FALSE, correct


# entropy-balancing placebo group
with(sample, unique(gvkey %in% matched.entropy$gvkey)) # FALSE, correct
with(sample, unique(ticker_symbol %in% matched.entropy$ticker_symbol)) # FALSE, correct

with(matched.entropy, unique(gvkey %in% sample$gvkey)) # FALSE, correct
with(matched.entropy, unique(ticker_symbol %in% sample$ticker_symbol)) # FALSE, correct

# so, there's no overlap between the past FCPA targets and any of the matched groups 

# Collapse the FCPA data to one row per ticker and attach it to the sample:
# indicator columns become 1 when any record of the ticker equals 1, sanctions
# are summed and averaged, and the year columns keep their extremes.
# NaN results (sanctions) and infinite results (latest/earliest year), which
# arise when a ticker has no non-missing values, are turned into NA.
# NOTE(review): `no_year` and `tot_actions` do not get that clean-up -- confirm
# they can never be entirely missing for a ticker.
sample <- left_join(
  x = sample,
  y = fcpa %>%
    group_by(ticker_symbol) %>%
    summarise(
      enforcement_action = ifelse(any(enforcement_action == 1), 1, 0),
      sanction_sum = sum(total_sanctions, na.rm = TRUE),
      sanction_avg = mean(total_sanctions, na.rm = TRUE),
      ongoing = as.numeric(any(ongoing == 1)),
      latest_year = max(latest_year, na.rm = TRUE),
      earliest_year = min(earliest_year, na.rm = TRUE),
      no_year = max(no_year, na.rm = TRUE),
      ticker_change = ifelse(any(ticker_change == 1), 1, 0),
      tot_actions = sum(tot_actions)
    ) %>%
    mutate(
      across(c("sanction_sum", "sanction_avg"),
             .fns = function(amount) ifelse(is.nan(amount), NA, amount)),
      across(.cols = c("latest_year", "earliest_year"),
             .fns = function(yr) ifelse(is.infinite(yr), NA, yr))
    ),
  by = "ticker_symbol"
)

## FCPA placebo: Propensity score (main) ----
# Placebo firms have no FCPA history: give them the same FCPA columns as the
# sample, with enforcement_action = 0 and everything else missing.
matched.prop <- mutate(
  matched.prop,
  enforcement_action = 0,
  sanction_sum = NA, sanction_avg = NA,
  ongoing = NA,
  latest_year = NA, earliest_year = NA, no_year = NA,
  ticker_change = NA,
  tot_actions = NA
)

# Sample and placebo must now share exactly the same columns:
ncol(matched.prop) == ncol(sample) # TRUE
setdiff(colnames(matched.prop), colnames(sample)) # none
setdiff(colnames(sample), colnames(matched.prop)) # none

### Join the two ----
# last overlap check before stacking the two groups
with(sample, unique(ticker_symbol %in% matched.prop$ticker_symbol)) # correct
with(matched.prop, unique(ticker_symbol %in% sample$ticker_symbol)) # correct
with(sample, unique(gvkey %in% matched.prop$gvkey)) # correct
with(matched.prop, unique(gvkey %in% sample$gvkey)) # correct

# Stack FCPA targets (FCPA_sample = 1) and placebo firms (FCPA_sample = 0),
# then tidy up the column order and flag the firms that took part in the match.
data <- rbind(mutate(sample, FCPA_sample = 1),
              mutate(matched.prop, FCPA_sample = 0)) %>%
  relocate(FCPA_sample, .after = ticker_symbol) %>%
  # make shares units:
  mutate(cshoq = cshoq*10^6) %>%
  relocate(naics2, .after = conm) %>%
  # 1 for every firm retained by the propensity-score match (treated or control)
  mutate(matched = as.numeric(ticker_symbol %in% matched.data.prop$ticker_symbol)) %>%
  relocate(matched, .after = FCPA_sample)

### Export data ----
write_rds(x = data, file = "data_out/stocks_clean.rds")

## FCPA placebo: Coarsened exact matching ----
# Placebo firms have no FCPA history: give them the same FCPA columns as the
# sample, with enforcement_action = 0 and everything else missing.
matched.cem <- mutate(
  matched.cem,
  enforcement_action = 0,
  sanction_sum = NA, sanction_avg = NA,
  ongoing = NA,
  latest_year = NA, earliest_year = NA, no_year = NA,
  ticker_change = NA,
  tot_actions = NA
)

### Join the two ----
# Stack FCPA targets (FCPA_sample = 1) and CEM placebo firms (FCPA_sample = 0),
# flag the firms retained by the match and tidy up the column order.
data <- rbind(mutate(sample, FCPA_sample = 1),
              mutate(matched.cem, FCPA_sample = 0)) %>%
  relocate(FCPA_sample, .after = ticker_symbol) %>%
  # 1 for every firm retained by coarsened exact matching (treated or control)
  mutate(matched = as.numeric(ticker_symbol %in% matched.data.cem$ticker_symbol)) %>%
  relocate(matched, .after = FCPA_sample) %>%
  relocate(naics2, .after = conm) %>%
  # make shares units:
  mutate(cshoq = cshoq*10^6)

### Export data ----
write_rds(x = data, file = "data_out/stocks_clean_CEM.rds")

## FCPA placebo: Entropy balancing ----
# Placebo firms have no FCPA history: give them the same FCPA columns as the
# sample, with enforcement_action = 0 and everything else missing.
matched.entropy <- mutate(
  matched.entropy,
  enforcement_action = 0,
  sanction_sum = NA, sanction_avg = NA,
  ongoing = NA,
  latest_year = NA, earliest_year = NA, no_year = NA,
  ticker_change = NA,
  tot_actions = NA
)

### Join the two ----
# Stack FCPA targets (FCPA_sample = 1) and entropy-balanced placebo firms
# (FCPA_sample = 0). The placebo rows already carry `entropy_weight`; the
# treated rows get theirs from the weighting object here.
# The join keys are spelled out so that the join does not depend on which
# column names the two tables happen to share.
data <- sample %>%
  mutate(FCPA_sample = 1) %>%
  # add weights for matched firms:
  left_join(data.frame(entropy_weight = ebal.w$weights,
                       ticker_symbol = matching$ticker_symbol,
                       FCPA_sample = matching$FCPA_sample),
            by = c("ticker_symbol", "FCPA_sample")) %>%
  rbind(matched.entropy %>%
          mutate(FCPA_sample = 0)) %>%
  relocate(FCPA_sample, .after = ticker_symbol)
# cross-tabulate "has a weight" against the treatment flag
table(!is.na(data$entropy_weight), data$FCPA_sample)
# it's correct. The FCPA targets that haven't been added to the entropy balancing 
# (due to NAs in covariates) do not have a weight. The placebo firms all have a weight because
# only the ones with covariates have been included

# Flag every firm that entered the entropy-balancing pool (`matching`), move
# the flag and the weight next to FCPA_sample, and tidy up the column order.
data <- mutate(data,
               matched = as.numeric(ticker_symbol %in% matching$ticker_symbol)) %>%
  relocate(matched, entropy_weight, .after = FCPA_sample) %>%
  relocate(naics2, .after = conm) %>%
  # make shares units:
  mutate(cshoq = cshoq*10^6)

### Export data ----
write_rds(x = data, file = "data_out/stocks_clean_entropy.rds")

#====# The End #====#